WRANGLERS
Photo by Alex Alvarez on Unsplash
The World Happiness Report has proven to be an indispensable tool for policymakers
looking to better understand what makes people happy…
— Jeffrey Sachs
# Read the 2020 World Happiness Report workbook into `df`
df <- read_xls('./archetypes/happiness-report/happiness-report-2020.xls')
# Print the data, then its dimensions (rows, columns)
df
dim(df)
## [1] 1704 26
# List every column with its type and its first values
glimpse(df)
## Rows: 1,704
## Columns: 26
## $ `Country name` <chr> "Afghanista~
## $ Year <dbl> 2008, 2009,~
## $ `Life Ladder` <dbl> 3.723590, 4~
## $ `Log GDP per capita` <dbl> 7.168690, 7~
## $ `Social support` <dbl> 0.4506623, ~
## $ `Healthy life expectancy at birth` <dbl> 50.80, 51.2~
## $ `Freedom to make life choices` <dbl> 0.7181143, ~
## $ Generosity <dbl> 0.177888572~
## $ `Perceptions of corruption` <dbl> 0.8816863, ~
## $ `Positive affect` <dbl> 0.5176372, ~
## $ `Negative affect` <dbl> 0.2581955, ~
## $ `Confidence in national government` <dbl> 0.6120721, ~
## $ `Democratic Quality` <dbl> -1.92968965~
## $ `Delivery Quality` <dbl> -1.6550844,~
## $ `Standard deviation of ladder by country-year` <dbl> 1.774662, 1~
## $ `Standard deviation/Mean of ladder by country-year` <dbl> 0.4765997, ~
## $ `GINI index (World Bank estimate)` <dbl> NA, NA, NA,~
## $ `GINI index (World Bank estimate), average 2000-16` <dbl> NA, NA, NA,~
## $ `gini of household income reported in Gallup, by wp5-year` <dbl> NA, 0.44190~
## $ `Most people can be trusted, Gallup` <dbl> NA, 0.28631~
## $ `Most people can be trusted, WVS round 1981-1984` <dbl> NA, NA, NA,~
## $ `Most people can be trusted, WVS round 1989-1993` <dbl> NA, NA, NA,~
## $ `Most people can be trusted, WVS round 1994-1998` <dbl> NA, NA, NA,~
## $ `Most people can be trusted, WVS round 1999-2004` <dbl> NA, NA, NA,~
## $ `Most people can be trusted, WVS round 2005-2009` <dbl> NA, NA, NA,~
## $ `Most people can be trusted, WVS round 2010-2014` <dbl> NA, NA, NA,~
The above provides a little more information. For example, we see that ‘Country name’ is a column of characters (chr), and that all other columns are numbers (dbl). This is useful because we can already guess that ‘Year’ does not have the right type: it is a label rather than a quantity, so it should not be treated as a number. We will fix it with the next command. Also notice the first values of each column; they help us get familiar with the data at hand. Some columns display many NA values, which indicate the absence of data.
# Store Year as a factor instead of a number
df <- mutate(df, Year = as.factor(Year))
# Check the new type of the column
str(df[["Year"]])
## Factor w/ 14 levels "2005","2006",..: 4 5 6 7 8 9 10 11 12 13 ...
glimpse(df)
## Rows: 1,704
## Columns: 26
## $ `Country name` <chr> "Afghanista~
## $ Year <fct> 2008, 2009,~
## $ `Life Ladder` <dbl> 3.723590, 4~
## $ `Log GDP per capita` <dbl> 7.168690, 7~
## $ `Social support` <dbl> 0.4506623, ~
## $ `Healthy life expectancy at birth` <dbl> 50.80, 51.2~
## $ `Freedom to make life choices` <dbl> 0.7181143, ~
## $ Generosity <dbl> 0.177888572~
## $ `Perceptions of corruption` <dbl> 0.8816863, ~
## $ `Positive affect` <dbl> 0.5176372, ~
## $ `Negative affect` <dbl> 0.2581955, ~
## $ `Confidence in national government` <dbl> 0.6120721, ~
## $ `Democratic Quality` <dbl> -1.92968965~
## $ `Delivery Quality` <dbl> -1.6550844,~
## $ `Standard deviation of ladder by country-year` <dbl> 1.774662, 1~
## $ `Standard deviation/Mean of ladder by country-year` <dbl> 0.4765997, ~
## $ `GINI index (World Bank estimate)` <dbl> NA, NA, NA,~
## $ `GINI index (World Bank estimate), average 2000-16` <dbl> NA, NA, NA,~
## $ `gini of household income reported in Gallup, by wp5-year` <dbl> NA, 0.44190~
## $ `Most people can be trusted, Gallup` <dbl> NA, 0.28631~
## $ `Most people can be trusted, WVS round 1981-1984` <dbl> NA, NA, NA,~
## $ `Most people can be trusted, WVS round 1989-1993` <dbl> NA, NA, NA,~
## $ `Most people can be trusted, WVS round 1994-1998` <dbl> NA, NA, NA,~
## $ `Most people can be trusted, WVS round 1999-2004` <dbl> NA, NA, NA,~
## $ `Most people can be trusted, WVS round 2005-2009` <dbl> NA, NA, NA,~
## $ `Most people can be trusted, WVS round 2010-2014` <dbl> NA, NA, NA,~
# Count the missing values of every column, then reshape to one row per column.
# pivot_longer() replaces the superseded gather().
missing_stats <- purrr::map_df(df, ~ sum(is.na(.))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Column name",
    values_to = "Count of missing values"
  )
missing_stats
# Distinct country names, sorted
distinct_df <- df %>%
  distinct(`Country name`) %>%
  arrange(`Country name`)
distinct_df
# Distinct years, sorted (this overwrites the previous `distinct_df`)
distinct_df <- df %>%
  distinct(Year) %>%
  arrange(Year)
distinct_df
# Absolute count of rows per Year
df_1 <- table(df$Year)
# Turn the table into a data frame with readable column names
df_2 <- dplyr::rename(as.data.frame(df_1), Year = Var1, Freq_absolute = Freq)
# Add the share of each Year as a percentage string rounded to 2 decimals
df_2 <- mutate(
  df_2,
  Freq_relative = paste0(
    round(100 * Freq_absolute / sum(Freq_absolute), digits = 2),
    "%"
  )
)
df_2
# Keep the third column onward (drops the first two columns of df)
df1 <- df[, 3:ncol(df)]
# Number of rows in that subset
nRows <- nrow(df1)
#' Count the positive, negative and zero values of one column of `df`
#'
#' Missing values are removed before counting.
#'
#' @param x Name (or position) of a column of the global data frame `df`.
#' @return A named vector with the elements "number of positives",
#'   "negatives" and "zero".
calcStats <- function(x) {
  # `[[` extracts the column as a plain vector for tibbles and data frames alike
  values <- na.omit(df[[x]])
  pos <- sum(values > 0)
  # `==` compares element-wise; the former `sum(temp = 0)` passed 0 to sum()
  # as a named argument and therefore always reported zero
  is_zero <- sum(values == 0)
  neg <- sum(values < 0)
  c("number of positives" = pos, "negatives" = neg, "zero" = is_zero)
}
# Apply calcStats to every column name of df1 and bind the named count vectors
# into a data frame
result <- as.data.frame(lapply(setNames(nm = colnames(df1)), calcStats))
result
# Long format: the selected columns are stacked into a "measure" (column name)
# and a "value" column
df_long <- pivot_longer(
  df,
  cols = `Life Ladder`:`Most people can be trusted, WVS round 2010-2014`,
  names_to = "measure",
  values_to = "value"
)
# One histogram per measure, each facet with its own axis scales and with
# grid lines and backgrounds removed
v1 <- ggplot(data = df_long, mapping = aes(x = value)) +
  geom_histogram(fill = "#79B8E5") +
  facet_wrap(~ measure, scales = "free") +
  theme(
    panel.background = element_blank(),
    panel.grid = element_blank(),
    strip.background = element_blank()
  )
# Pass the plot to girafe() with the SVG size given as 1280/72 by 720/72
girafe(
  ggobj = v1,
  width_svg = 1280 / 72,
  height_svg = 720 / 72,
  options = list(opts_sizing(rescale = TRUE, width = 1.0))
)
####################################################################################
# Required libraries
library(tidyverse)
library(readxl)
####################################################################################
# Load data
#' Read an .xls file located relative to the current working directory
#'
#' @param data_source File path; it is appended to `getwd()` with a "/".
#' @return The value returned by `read_xls()` for that file.
load_data <- function(data_source) {
  full_path <- paste0(getwd(), "/", data_source)
  read_xls(full_path)
}
# Call the function to load the data and keep the result in `df`: the calls
# further down all take `df`, which would otherwise not be defined by this script
df <- load_data("./archetypes/happiness-report-2020.xls")
# Print the loaded data, as the unassigned call did
df
####################################################################################
####################################################################################
# Find dimensions
#' Dimensions of an object
#'
#' @param df_dimensions Object to measure, e.g. a data frame.
#' @return The result of `dim()`: for a data frame, the number of rows
#'   followed by the number of columns.
f_dimensions <- function(df_dimensions) {
  dims <- dim(df_dimensions)
  dims
}
# Example call: replace `df` with your own data frame
f_dimensions(df)
####################################################################################
####################################################################################
# Find structure: column name, column type and first rows
#' Print the structure of an object
#'
#' @param df_structure Object to inspect, e.g. a data frame.
#' @return The (invisible) value of `str()`; the structure is printed as a
#'   side effect.
f_structure <- function(df_structure) {
  # Alternative display: glimpse(df_structure)
  utils::str(df_structure)
}
# Example call: replace `df` with your own data frame
f_structure(df)
####################################################################################
####################################################################################
# Count missing values
#' Count the missing values of every column
#'
#' @param df_missing_values Data frame to inspect.
#' @return A tibble with one row per column of the input: "Column name" and
#'   "Count of missing values".
f_missing_values <- function(df_missing_values) {
  # One NA count per column, returned as a one-row tibble
  na_counts <- purrr::map_df(df_missing_values, ~ sum(is.na(.)))
  # pivot_longer() replaces the superseded gather()
  pivot_longer(
    na_counts,
    cols = everything(),
    names_to = "Column name",
    values_to = "Count of missing values"
  )
}
# Example call: replace `df` with your own data frame
f_missing_values(df)
####################################################################################
####################################################################################
# Display distinct values of a column
#' Distinct values of one or more columns, sorted
#'
#' @param df_distinct_values Data frame to inspect.
#' @param ... Columns, forwarded to both `distinct()` and `arrange()`.
#' @return The distinct combinations of the given columns, ordered by them.
f_distinct_values <- function(df_distinct_values, ...) {
  unique_rows <- distinct(df_distinct_values, ...)
  arrange(unique_rows, ...)
}
# Example call: replace `df` and the column name with your own
f_distinct_values(df,`Country name`)
####################################################################################
#########################################################################################
# Display frequency of the values of a column
#' Absolute and relative frequency of the values of a vector
#'
#' @param df_frequency Not used by the function body.
#' @param column Vector of values to tabulate, e.g. `df$Year`.
#' @return A data frame with the columns `Frequency_column` (the value),
#'   `Freq_absolute` (its count) and `Freq_relative` (its share as a
#'   percentage string rounded to 2 decimals).
f_frequency <- function(df_frequency, column) {
  # table(column) labels its dimension "column", which rename() picks up below
  counts <- as.data.frame(table(column))
  counts <- rename(counts, Frequency_column = column, Freq_absolute = Freq)
  mutate(
    counts,
    Freq_relative = paste0(
      round(100 * Freq_absolute / sum(Freq_absolute), digits = 2),
      "%"
    )
  )
}
# Example call: replace `df` and `df$Year` with your own data frame and column
f_frequency (df,df$Year)
#########################################################################################
#########################################################################################
# Display histograms of numerical variables
#' Histograms of a range of numeric columns, one facet per column
#'
#' @param df_histogram Data frame holding the columns.
#' @param column_1 First column of the range (unquoted name).
#' @param column_2 Last column of the range (unquoted name).
#' @return A ggplot object with one histogram per column in the range.
f_histogram <- function(df_histogram, column_1, column_2) {
  # Embrace the arguments so the range is resolved against the data itself,
  # instead of deparsing them to strings and selecting through local variables
  # (deprecated external-vector selection, and shadowed by any column that
  # happens to share the variable's name)
  df_long <- df_histogram %>%
    pivot_longer(
      cols = {{ column_1 }}:{{ column_2 }},
      names_to = "measure",
      values_to = "value"
    )
  # Each facet gets its own axis scales; grid lines and backgrounds are removed
  ggplot(df_long, aes(x = value)) +
    geom_histogram(fill = "#79B8E5") +
    facet_wrap(~ measure, scales = "free") +
    theme(
      panel.grid = element_blank(),
      strip.background = element_blank(),
      panel.background = element_blank()
    )
}
# Example call: replace `df` and the first/last column of the range with your own
f_histogram (df,`Life Ladder`,`Most people can be trusted, WVS round 2010-2014`)
#########################################################################################